This workshop is mainly adapted from the mixOmics tutorials (http://mixomics.org) and the Bioconductor vignette (https://bioconductor.org/packages/release/bioc/html/mixOmics.html).
We saw previously, methods for discriminating experimental groups (eg. sPLSDA), and methods seeking correlations between features across ’omic layers.
Data Integration Analysis for Biomarker discovery using Latent cOmponents (DIABLO) - aims to do both; group discrimination and use multi-omic correlation structures.
The DIABLO publication provides a deeper explanation of this method: https://doi.org/10.1093/bioinformatics/bty1054
library(mixOmics)
## Loading required package: MASS
## Loading required package: lattice
## Loading required package: ggplot2
##
## Loaded mixOmics 6.24.0
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us: citation('mixOmics')
data(breast.TCGA)
# Extract training data and name each data frame
# There are 3 dataframes (3 omics of the same samples), stored as list
# In each dataframe, the rows represent samples.
# Columns represent feature measurments (mRNA, miRNA, protein)
X <- list(miRNA = breast.TCGA$data.train$mirna,
mRNA = breast.TCGA$data.train$mrna,
protein = breast.TCGA$data.train$protein)
Y <- breast.TCGA$data.train$subtype
table(Y)
## Y
## Basal Her2 LumA
## 45 30 75
The ‘design’ matrix input for DIABLO describes the relationship between the input dataframes. Expected pairwise linkage is expressed as a number in the range 0-1, where 1 represents full strength. Prior biological knowledge may affect the design, e.g. here we do expect generally miRNA regulates mRNA which corresponds to protein expression.
res1.pls.tcga <- pls(X$mRNA, X$protein, ncomp = 1)
cor(res1.pls.tcga$variates$X, res1.pls.tcga$variates$Y)
## comp1
## comp1 0.9031761
res2.pls.tcga <- pls(X$mRNA, X$miRNA, ncomp = 1)
cor(res2.pls.tcga$variates$X, res2.pls.tcga$variates$Y)
## comp1
## comp1 0.8456299
res3.pls.tcga <- pls(X$protein, X$miRNA, ncomp = 1)
cor(res3.pls.tcga$variates$X, res3.pls.tcga$variates$Y)
## comp1
## comp1 0.7982008
# set up a matrix of values corresponding to the datasets
design <- matrix(0.1, ncol = length(X), nrow = length(X),
dimnames = list(names(X), names(X)))
diag(design) <- 0
design
## miRNA mRNA protein
## miRNA 0.0 0.1 0.1
## mRNA 0.1 0.0 0.1
## protein 0.1 0.1 0.0
Checking correlation with pls with 1 component suggests the dataframes are indeed correlated. A design with weights ∼0.8 could be chosen. However, in this demo we will put a low value 0.1, which will assist with predictions of test data later.
diablo.tcga <- block.plsda(X, Y, ncomp = 5, design = design)
set.seed(123) # For reproducibility, remove for your analyses
perf.diablo.tcga = perf(diablo.tcga, validation = 'Mfold',
folds = 6, nrepeat = 30, cpus = 3)
# save that result as an rda file
saveRDS(perf.diablo.tcga, file = "perf.diablo.tcga.rds")
perf.diablo.tcga <- readRDS("perf.diablo.tcga.rds")
perf.diablo.tcga$error.rate # Lists the different types of error rates
## $miRNA
## max.dist centroids.dist mahalanobis.dist
## comp1 0.2413333 0.2380000 0.2380000
## comp2 0.1580000 0.1546667 0.1500000
## comp3 0.1333333 0.1420000 0.1213333
## comp4 0.1273333 0.1526667 0.1120000
## comp5 0.1106667 0.1480000 0.1193333
##
## $mRNA
## max.dist centroids.dist mahalanobis.dist
## comp1 0.20000000 0.09933333 0.09933333
## comp2 0.04866667 0.04466667 0.05333333
## comp3 0.02933333 0.02466667 0.03733333
## comp4 0.02800000 0.02733333 0.03266667
## comp5 0.03266667 0.03200000 0.03066667
##
## $protein
## max.dist centroids.dist mahalanobis.dist
## comp1 0.22066667 0.17066667 0.17066667
## comp2 0.09866667 0.09733333 0.10466667
## comp3 0.08466667 0.08666667 0.08400000
## comp4 0.06933333 0.09133333 0.07266667
## comp5 0.07533333 0.08333333 0.07000000
perf.diablo.tcga$choice.ncomp$WeightedVote
## max.dist centroids.dist mahalanobis.dist
## Overall.ER 3 3 3
## Overall.BER 3 3 3
# Plot of the error rates based on weighted vote
plot(perf.diablo.tcga)
ncomp <- perf.diablo.tcga$choice.ncomp$WeightedVote["Overall.BER", "centroids.dist"]
ncomp <- 2 # manually chose 2 components
ncomp
set.seed(123) # for reproducibility
test.keepX <- list(miRNA = c(5:7, round(2^(seq(3,5,0.5)))),
mRNA = c(5:7, round(2^(seq(3,5,0.5)))),
proteomics = c(5:7, round(2^(seq(3,5,0.5)))))
# (slow step)
tune.diablo.tcga <- tune.block.splsda(X, Y, ncomp = ncomp,
test.keepX = test.keepX, design = design,
validation = 'Mfold', folds = 6, nrepeat = 3,
BPPARAM = BiocParallel::SnowParam(workers = 3),
dist = "centroids.dist")
# save that result as an rda file
saveRDS(tune.diablo.tcga, file = "tune.diablo.tcga.rds")
list.keepX <- tune.diablo.tcga$choice.keepX
list.keepX
To speed up the demonstration we can skip this and plug in the resulting values:
ncomp <- 2
list.keepX <- list(miRNA = c(23,6), mRNA = c(32, 23), protein = c(7, 5))
#list.keepX <- list(miRNA = c(14,5), mRNA = c(8, 25), protein = c(10, 5))
Proceeding to make the DIABLO model with the optimised settings:
diablo.tcga <- block.splsda(X, Y, ncomp = ncomp,
keepX = list.keepX, design = design)
## Design matrix has changed to include Y; each block will be
## linked to Y.
# inspect the miRNA/mRNA/protein variables selected on component 1
selectVar(diablo.tcga, block = 'miRNA', comp = 1)
## $miRNA
## $miRNA$name
## [1] "hsa-mir-17" "hsa-mir-590" "hsa-mir-505" "hsa-mir-130b"
## [5] "hsa-mir-106b" "hsa-mir-20a" "hsa-mir-106a" "hsa-mir-197"
## [9] "hsa-mir-1301" "hsa-mir-186" "hsa-mir-93" "hsa-let-7d"
## [13] "hsa-mir-532" "hsa-mir-146a" "hsa-mir-9-2" "hsa-mir-501"
## [17] "hsa-mir-9-1" "hsa-mir-19b-2" "hsa-mir-92a-2" "hsa-mir-455"
## [21] "hsa-mir-378" "hsa-mir-1307" "hsa-mir-660"
##
## $miRNA$value
## value.var
## hsa-mir-17 0.450459738
## hsa-mir-590 0.388446859
## hsa-mir-505 0.375843695
## hsa-mir-130b 0.353924564
## hsa-mir-106b 0.281217527
## hsa-mir-20a 0.271874982
## hsa-mir-106a 0.267189204
## hsa-mir-197 0.167827031
## hsa-mir-1301 0.156210998
## hsa-mir-186 0.153972385
## hsa-mir-93 0.132707811
## hsa-let-7d 0.118596410
## hsa-mir-532 0.101450139
## hsa-mir-146a 0.101429290
## hsa-mir-9-2 0.087893745
## hsa-mir-501 0.082737637
## hsa-mir-9-1 0.077565887
## hsa-mir-19b-2 0.054862710
## hsa-mir-92a-2 0.039207676
## hsa-mir-455 0.029867451
## hsa-mir-378 0.023989869
## hsa-mir-1307 0.009817260
## hsa-mir-660 0.007376438
##
##
## $comp
## [1] 1
selectVar(diablo.tcga, block = 'mRNA', comp = 1)
## $mRNA
## $mRNA$name
## [1] "ZNF552" "KDM4B" "CCNA2" "LRIG1" "PREX1" "FUT8" "C4orf34"
## [8] "TTC39A" "ASPM" "SLC43A3" "MEX3A" "SEMA3C" "E2F1" "STC2"
## [15] "FMNL2" "LMO4" "DTWD2" "MED13L" "CSRP2" "NTN4" "KIF13B"
## [22] "NCAPG2" "SLC19A2" "EPHB3" "FAM63A" "HTRA1" "MEGF9" "SLC5A6"
## [29] "MTL5" "SNORA8" "SIGIRR" "IFITM2"
##
## $mRNA$value
## value.var
## ZNF552 -0.404203617
## KDM4B -0.352853177
## CCNA2 0.301372510
## LRIG1 -0.288001223
## PREX1 -0.284545868
## FUT8 -0.277397602
## C4orf34 -0.269595501
## TTC39A -0.258497759
## ASPM 0.207699848
## SLC43A3 0.200004231
## MEX3A 0.188053146
## SEMA3C -0.175388833
## E2F1 0.133969965
## STC2 -0.112825107
## FMNL2 0.105693521
## LMO4 0.095826529
## DTWD2 -0.090895663
## MED13L -0.087465685
## CSRP2 0.076251574
## NTN4 -0.073581363
## KIF13B -0.061111872
## NCAPG2 0.060658224
## SLC19A2 -0.046302846
## EPHB3 0.045155303
## FAM63A -0.029928269
## HTRA1 -0.018464698
## MEGF9 -0.017264097
## SLC5A6 0.013335446
## MTL5 -0.008781620
## SNORA8 0.008464001
## SIGIRR -0.004075133
## IFITM2 -0.003493906
##
##
## $comp
## [1] 1
selectVar(diablo.tcga, block = 'protein', comp = 1)
## $protein
## $protein$name
## [1] "ER-alpha" "GATA3" "ASNS" "Cyclin_B1" "AR" "Cyclin_E1"
## [7] "JNK2"
##
## $protein$value
## value.var
## ER-alpha -0.69971601
## GATA3 -0.60820072
## ASNS 0.22606879
## Cyclin_B1 0.21298048
## AR -0.17639751
## Cyclin_E1 0.08789422
## JNK2 -0.07197303
##
##
## $comp
## [1] 1
For each component #, view the separation of experimental groups and the correlation of that component # from each block.
plotDiablo(diablo.tcga, ncomp = 1)
plotDiablo(diablo.tcga, ncomp = 2)
View all samples, plotted by the leading components in each block.
plotIndiv(diablo.tcga, ind.names = FALSE, legend = TRUE, comp=1:2,
title = 'TCGA, DIABLO comp 1 - 2')
plotArrow(diablo.tcga, ind.names = FALSE, legend = TRUE,
title = 'TCGA, DIABLO comp 1 - 2')
plotVar(diablo.tcga, var.names = FALSE, style = 'graphics', legend = TRUE,
pch = c(16, 17, 15), cex = c(1.5,1.5,1.5),
col = color.mixo(4:6),
title = 'TCGA, DIABLO comp 1 - 2')
# two components, cutoff 0.6
circosPlot(diablo.tcga, cutoff = 0.6, line = TRUE, comp = 1:2,
color.blocks = color.mixo(4:6), color.cor = color.mixo(7:8),
size.labels = 1.5)
# component 1 only, cutoff 0.7
circosPlot(diablo.tcga, cutoff = 0.7, line = TRUE, comp = 1,
color.blocks = color.mixo(4:6), color.cor = color.mixo(7:8),
size.labels = 1.5, size.variables = 0.6)
network(diablo.tcga, blocks = c(1,2,3),
cutoff = 0.4,
color.node = color.mixo(4:6),
save = 'png', name.save = 'diablo-network'
)
Some adjustments for clarity…
network(diablo.tcga, blocks = c(1,2,3),
cutoff = 0.7,graph.scale = 0.3,
color.node = color.mixo(4:6),
save = 'png', name.save = 'diablo-network2'
)
Shown here for comp1, but check also for comp=2, comp=3.
plotLoadings(diablo.tcga, comp = 1, contrib = 'max', method = 'median')
plotLoadings(diablo.tcga, comp = 2, contrib = 'max', method = 'median')
cimDiablo(diablo.tcga, color.blocks = color.mixo(4:6),
comp = 1, margin=c(6,3), legend.position = "right",
dist.method = c("canberra", "canberra"),
row.cex = 0.25, col.cex = 0.5,
save="png", name.save="cim.diablo.comp1")
cimDiablo(diablo.tcga, color.blocks = color.mixo(4:6),
comp = 2, margin=c(6,3), legend.position = "right",
dist.method = c("canberra", "canberra"),
row.cex = 0.25, col.cex = 0.5,
save="png", name.save="cim.diablo.comp2")
Re-check model performance with cross-validation.
set.seed(123) # For reproducibility, remove otherwise
perf.diablo.tcga <- perf(diablo.tcga, validation = 'Mfold', folds = 6, cpus = 3,
nrepeat = 30, dist = 'centroids.dist')
# save that result as an rda file
saveRDS(perf.diablo.tcga, file = "perf.diablo.tcga.tuned.rds")
perf.diablo.tcga # Lists the different outputs
# Performance with Majority vote
perf.diablo.tcga$MajorityVote.error.rate
## $max.dist
## comp1 comp2 comp3 comp4 comp5
## Basal 0.02444444 0.03777778 0.02444444 0.04222222 0.057777778
## Her2 1.00000000 0.29666667 0.21333333 0.19333333 0.173333333
## LumA 0.00000000 0.00400000 0.00000000 0.00000000 0.001333333
## Overall.ER 0.20733333 0.07266667 0.05000000 0.05133333 0.052666667
## Overall.BER 0.34148148 0.11281481 0.07925926 0.07851852 0.077481481
##
## $centroids.dist
## comp1 comp2 comp3 comp4 comp5
## Basal 0.06888889 0.04888889 0.02888889 0.04666667 0.04444444
## Her2 0.26333333 0.11333333 0.08000000 0.10000000 0.12000000
## LumA 0.06800000 0.05733333 0.02800000 0.02800000 0.02400000
## Overall.ER 0.10733333 0.06600000 0.03866667 0.04800000 0.04933333
## Overall.BER 0.13340741 0.07318519 0.04562963 0.05822222 0.06281481
##
## $mahalanobis.dist
## comp1 comp2 comp3 comp4 comp5
## Basal 0.06888889 0.05111111 0.01777778 0.04000000 0.05111111
## Her2 0.26333333 0.12333333 0.11666667 0.08666667 0.10000000
## LumA 0.06800000 0.06666667 0.05066667 0.04000000 0.01600000
## Overall.ER 0.10733333 0.07333333 0.05400000 0.04933333 0.04333333
## Overall.BER 0.13340741 0.08037037 0.06170370 0.05555556 0.05570370
# Performance with Weighted vote
perf.diablo.tcga$WeightedVote.error.rate
## $max.dist
## comp1 comp2 comp3 comp4 comp5
## Basal 0.02444444 0.02888889 0.02444444 0.03555556 0.048888889
## Her2 1.00000000 0.25666667 0.17666667 0.16666667 0.163333333
## LumA 0.00000000 0.00400000 0.00000000 0.00000000 0.001333333
## Overall.ER 0.20733333 0.06200000 0.04266667 0.04400000 0.048000000
## Overall.BER 0.34148148 0.09651852 0.06703704 0.06740741 0.071185185
##
## $centroids.dist
## comp1 comp2 comp3 comp4 comp5
## Basal 0.06888889 0.03111111 0.02888889 0.04444444 0.04222222
## Her2 0.23666667 0.08000000 0.05333333 0.08333333 0.10000000
## LumA 0.06800000 0.05600000 0.02666667 0.02800000 0.02400000
## Overall.ER 0.10200000 0.05333333 0.03266667 0.04400000 0.04466667
## Overall.BER 0.12451852 0.05570370 0.03629630 0.05192593 0.05540741
##
## $mahalanobis.dist
## comp1 comp2 comp3 comp4 comp5
## Basal 0.06888889 0.03333333 0.008888889 0.03111111 0.04000000
## Her2 0.23666667 0.09333333 0.086666667 0.07666667 0.09666667
## LumA 0.06800000 0.06266667 0.045333333 0.03600000 0.01600000
## Overall.ER 0.10200000 0.06000000 0.042666667 0.04266667 0.03933333
## Overall.BER 0.12451852 0.06311111 0.046962963 0.04792593 0.05088889
auc.diablo.miRNA <- auroc(diablo.tcga, roc.block = "miRNA", roc.comp = 2,
print = FALSE)
auc.diablo.mRNA <- auroc(diablo.tcga, roc.block = "mRNA", roc.comp = 2,
print = FALSE)
auc.diablo.protein <- auroc(diablo.tcga, roc.block = "protein", roc.comp = 2,
print = FALSE)
The next test will be if we delete the protein data layer, can the model still predict the cancer subtype?
# Prepare test set data: here one block (proteins) is missing
data.test.tcga <- list(miRNA = breast.TCGA$data.test$mirna,
mRNA = breast.TCGA$data.test$mrna)
predict.diablo.tcga <- predict(diablo.tcga, newdata = data.test.tcga,
dist = "centroids.dist")
## Warning in predict.block.spls(diablo.tcga, newdata = data.test.tcga, dist =
## "centroids.dist"): Some blocks are missing in 'newdata'; the prediction is
## based on the following blocks only: miRNA, mRNA
# The warning message will inform us that one block is missing
predict.diablo.tcga # List the different outputs
##
## Call:
## predict.block.spls(object = diablo.tcga, newdata = data.test.tcga, dist = "centroids.dist")
##
## Main numerical outputs:
## --------------------
## Prediction values of the test samples for each block and each component: see object$predict
## variates of the test samples for each block and each component: see object$variates
## Classification of the test samples for each distance, for each block and each component: see object$class
## Majority vote of the test samples for each distance and each component: see object$vote
confusion.mat.tcga <- get.confusion_matrix(truth = breast.TCGA$data.test$subtype,
predicted = predict.diablo.tcga$WeightedVote$centroids.dist[,2])
confusion.mat.tcga
## predicted.as.Basal predicted.as.Her2 predicted.as.LumA
## Basal 20 1 0
## Her2 0 14 0
## LumA 0 1 34
get.BER(confusion.mat.tcga)
## [1] 0.02539683
confusion.df.tcga <- reshape2::melt(confusion.mat.tcga)
confusion.df.tcga
## Var1 Var2 value
## 1 Basal predicted.as.Basal 20
## 2 Her2 predicted.as.Basal 0
## 3 LumA predicted.as.Basal 0
## 4 Basal predicted.as.Her2 1
## 5 Her2 predicted.as.Her2 14
## 6 LumA predicted.as.Her2 1
## 7 Basal predicted.as.LumA 0
## 8 Her2 predicted.as.LumA 0
## 9 LumA predicted.as.LumA 34
TrueClass <- factor(rep(c("Basal", "Her2", "LumA"),3))
PredictedClass <- factor(c(rep("Basal",3), rep("Her2",3), rep("LumA",3)))
PredictionCount <- confusion.df.tcga$value
confusion.df.tcga <- as.data.frame(confusion.mat.tcga)
plotData <- data.frame(TrueClass, PredictedClass, PredictionCount)
plotData$goodbad <- "bad"
plotData$goodbad[plotData$TrueClass==plotData$PredictedClass] <- "good"
plotData$proportion <- plotData$PredictionCount / sum(plotData$PredictionCount)
plotData
## TrueClass PredictedClass PredictionCount goodbad proportion
## 1 Basal Basal 20 good 0.28571429
## 2 Her2 Basal 0 bad 0.00000000
## 3 LumA Basal 0 bad 0.00000000
## 4 Basal Her2 1 bad 0.01428571
## 5 Her2 Her2 14 good 0.20000000
## 6 LumA Her2 1 bad 0.01428571
## 7 Basal LumA 0 bad 0.00000000
## 8 Her2 LumA 0 bad 0.00000000
## 9 LumA LumA 34 good 0.48571429
ggplot(data = plotData, mapping = aes(x = TrueClass, y = PredictedClass,
fill = goodbad, alpha = proportion)) +
geom_tile(show.legend = FALSE, colour="white", linewidth=0.5) +
geom_text(aes(label = PredictionCount), vjust = .5, fontface = "bold", alpha = 1, size=6) +
scale_fill_manual(values = c(good = "green", bad = "red")) +
theme_classic() + xlab("True Subtype") + ylab("Predicted Subtype") +
theme(axis.text = element_text(face="bold", size=14)) +
xlim(rev(levels(plotData$TrueClass))) + ggtitle("Confusion Matrix")